{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# spaCy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install spacy\n",
    "# !pip install spaCy  -i https://pypi.tuna.tsinghua.edu.cn/simple\n",
    "!python -m spacy download en_core_web_sm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "nlp = spacy.load('en_core_web_sm')\n",
    "doc = nlp(u'This is a sentence.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 1.tokenize功能"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'str'> i <class 'spacy.tokens.token.Token'> i\n",
      "<class 'str'> am <class 'spacy.tokens.token.Token'> am\n",
      "<class 'str'> a <class 'spacy.tokens.token.Token'> a\n",
      "<class 'str'> nice <class 'spacy.tokens.token.Token'> nice\n",
      "<class 'str'> baby <class 'spacy.tokens.token.Token'> baby\n",
      "<class 'str'> ! <class 'spacy.tokens.token.Token'> !\n"
     ]
    }
   ],
   "source": [
    "txt = \"i am a nice baby!\"\n",
    "tks = nlp.tokenizer(txt)\n",
    "for t in tks:\n",
    "    print(type(t.text), t.text, type(t), t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This\n",
      "is\n",
      "a\n",
      "sentence\n",
      ".\n"
     ]
    }
   ],
   "source": [
    "for token in doc:\n",
    "    print(token)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 2.词干化（Lemmatize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This this 1995909169258310477\n",
      "is be 10382539506755952630\n",
      "a a 11901859001352538922\n",
      "sentence sentence 18108853898452662235\n",
      ". . 12646065887601541794\n"
     ]
    }
   ],
   "source": [
    "for token in doc:\n",
    "    print(token, token.lemma_, token.lemma)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3.词性标注(POS Tagging)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This DET 90\n",
      "is AUX 87\n",
      "a DET 90\n",
      "sentence NOUN 92\n",
      ". PUNCT 97\n"
     ]
    }
   ],
   "source": [
    "for token in doc:\n",
    "    print(token, token.pos_, token.pos)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 4.命名实体识别（NER）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "for entity in doc.ents:\n",
    "    print(entity, entity.label_, entity.label)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 5.名词短语提取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a sentence\n"
     ]
    }
   ],
   "source": [
    "for nounc in doc.noun_chunks:\n",
    "    print(nounc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pt",
   "language": "python",
   "name": "pt"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
